---------------------------------------------------------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_TEST/AFQT_MATCHING_TEST_with_weights.log
  log type:  text
 opened on:  25 Nov 2013, 15:57:00

. 
. /***************************************************
> MATCHING AFQT SCORES ACROSS NLSY 1979 and NLSY 1997
> 
> This do file creates a data file with comparable AFQT scores across both NLSY79 and NLSY97.
> There are two main steps in creating the comparable AFQT scores:
> 
> 1. The 1979 ASVAB is a Paper and Pencil (P&P) test, while the 1997 ASVAB was computer-administered. 
> To make the scores comparable across cohorts, we rely on a percentile mapping provided by Dan Segall (Segall (1997))
> 
> 2. The age at which respondents took the test differs between 1979 and 1997. The 1997 sample is much younger.
> For both samples, we observe a large sample of individuals taking the test at age 16. We exploit this overlap in 
> test-taking age by mapping all test scores within each cohort into the age-16 distribution, based on the within-age 
> ranking of test scores. 
> 
> For details, see Segall (1997) and Altonji, Bharadwaj & Lange (2009).
> 
> Altonji, J., Bharadwaj, P. & Lange, F. "Changes in the Characteristics of American Youth - 
> Implications for Adult Outcomes" NBER Working Papers No. 13883, revised 2009.
> Segall, D. O. (1997). "Equating the CAT-ASVAB". In W. A. Sands, B. K. Waters, & J. R. McBride (Eds.), 
>         Computerized adaptive testing: From inquiry to operation (pp. 181-198). Washington, DC: American Psychological Associat
> ion. 
> Date: August 19, 2009.
> ******************************************************/
. 
. // Set path to directory containing afqt1997a.csv //
. capture cd "/afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_TEST"

. * cd "/Users/JKukkur/Documents/Research/ABL/AFQT MATCHING"
. 
. tempfile afqt97 afqt_append nlsy_agestd agestd_afqt missings finished_product

. 
. /****************************************************************
> First Step of score conversion: 
> Transform CAT Test Scores from NLSY97 into Paper and Pencil Test Scores using the mapping provided by Dan Segall.
> Combine this data with raw data from NLSY79.
> *****************************************************************/
. 
. 
. // afqt1997a.csv contains individual IDs and sex from NLSY1997 and the ASVAB component scores provided by Dan Segall. 
. // These are P&P-equivalent scores based on the mapping procedure described in Segall (1997). 
. // Dan Segall supplied us with these P&P-equivalent scores using ASVAB component scores contained in (DICTIONARY FILE).
. insheet using afqt1997a.csv, comma
(36 vars, 8984 obs)

. ren v1 pid

. ren v2 male

. gen afqt80=ar+wk+pc+0.5*no if ar!=0 & wk!=0 & pc!=0 & no!=0 // pc=paragraph comprehension, no=numerical operations 
(1982 missing values generated)

. keep pid male afqt80

. sort pid

. save `afqt97', replace
(note: file /tmp/St15170.00000l not found)
file /tmp/St15170.00000l saved

. 
. // Merge age in for the NLSY97 sample //
. infile using age97.dct, clear

infile dictionary {
  R0000100 "PUBID - YTH ID CODE 1997"
  R1194100 "CV_AGE_INT_DATE 1997"
  R2553500 "CV_AGE_INT_DATE 1998"
  R3876300 "CV_AGE_INT_DATE 1999"
  R5453700 "CV_AGE_INT_DATE 2000"
  R7216000 "CV_AGE_INT_DATE 2001"
  S1531400 "CV_AGE_INT_DATE 2002"
  S2001000 "CV_AGE_INT_DATE 2003"
  S3801100 "CV_AGE_INT_DATE 2004"
}

(8984 observations read)

. ren R0000100    pid

. ren R1194100    age  // age as of 1997 (test-taking year for NLSY97) //

. keep pid age

. sort pid

. merge pid using `afqt97' 
(note: you are using old merge syntax; see [D] merge for new syntax)

. drop _merge 

. sort pid

. save `afqt97', replace
file /tmp/St15170.00000l saved

. 
. // Merge in weights for 1997 data //
. // We use the custom weight provided by the NLSY for the year 1997, the year when the ASVAB was administered. //
. insheet using weights97.csv, clear
(2 vars, 8984 obs)

. sort pid

. merge pid using `afqt97'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge 

. gen sample=1                            // Sample Identifier: 1= 1997 NLSY sample, 0=1979 NLSY sample //

. sort sample pid

. save `afqt97', replace
file /tmp/St15170.00000l saved

. 
. // NLSY 1979 Sample: Age Information and AFQT-scores //
. infile using nlsy79_vars.dct, clear

infile dictionary {
  R0000100 "ID# (1-12686) 79"
  R0000500 "DATE OF BIRTH - YR 79"
  R0173600 "SAMPLE ID  79 INT"
  R0406510 "AGE OF R @ INT DATE 80"
  R0618011 "PROFILES ASVAB SEC 2-STD SCRNR 81"
  R0618012 "PROFILES ASVAB SEC 3-STD SCRNR 81"
  R0618013 "PROFILES ASVAB SEC 4-STD SCRNR 81"
  R0618014 "PROFILES ASVAB SEC 5-STD SCRNR 81"
}

(12686 observations read)

. ren R0000100 pid                

. ren R0406510 age                                                                        // Age as of 1980 (test taking year for
>  NLSY79) //

. ren R0000500 birthyear 

. ren R0173600 sampid

. ren R0618011 ar

. ren R0618012 wk

. ren R0618013 pc

. ren R0618014 no

. qui replace ar=. if ar<0

. qui replace wk=. if wk<0

. qui replace pc=. if pc<0

. qui replace no=. if no<0

. gen afqt80=ar+wk+pc+0.5*no if ar!=0 & wk!=0 & pc!=0 & no!=0
(808 missing values generated)

. drop if afqt80==.
(808 observations deleted)

. label var afqt80 "afqt according to 1980 definition"

. replace age=80-birthyear if age<0 & birthyear!=.        // Fill missing age using birth-year // 
(202 real changes made)

. drop birthyear

. gen sample=0                                                    // Sample Identifier: 1= 1997 NLSY sample, 0=1979 NLSY sample /
> /

. append using `afqt97'                           // Append the data-set for NLSY97 //

. sort sample pid

. save `afqt_append', replace
(note: file /tmp/St15170.00000m not found)
file /tmp/St15170.00000m saved

. 
. * Merge in 1979 weights
. // We use the custom weight provided by the NLSY for the year 1979. (The ASVAB itself was administered in 1980; see the age variable above.) //
. 
. insheet using weights79.csv, clear
(2 vars, 12686 obs)

. gen sample=0

. sort sample pid

. merge sample pid using `afqt_append'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge

. 
. * The weights have implied 2 decimal places.
. replace weight=weight/100
weight was long now double
(21670 real changes made)

. sort sample pid

. save `afqt_append', replace
file /tmp/St15170.00000m saved

. 
. /***************************************************************************
> Second Step: Percentile mapping of P&P test scores into age=16 distribution
> ****************************************************************************/
. 
. *****           GENERATING PERCENTILES OF SCORES BY AGE AND NLSY-SAMPLE *********
. use `afqt_append', clear

. drop ar wk pc no

. 
. // Drop those with missing AFQT scores
. drop if afqt80==.
(2790 observations deleted)

. 
. // For Table I in Introduction.doc
. bysort sample: tab age

---------------------------------------------------------------------------------------------------------------------------------
-> sample = 0

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         15 |        962        8.10        8.10
         16 |      1,511       12.72       20.82
         17 |      1,488       12.53       33.35
         18 |      1,432       12.06       45.40
         19 |      1,502       12.65       58.05
         20 |      1,558       13.12       71.17
         21 |      1,539       12.96       84.12
         22 |      1,529       12.87       96.99
         23 |        357        3.01      100.00
------------+-----------------------------------
      Total |     11,878      100.00

---------------------------------------------------------------------------------------------------------------------------------
-> sample = 1

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         12 |        944       13.48       13.48
         13 |      1,387       19.81       33.29
         14 |      1,460       20.85       54.14
         15 |      1,478       21.11       75.25
         16 |      1,303       18.61       93.86
         17 |        427        6.10       99.96
         18 |          3        0.04      100.00
------------+-----------------------------------
      Total |      7,002      100.00


. 
. 
. // For Figure 1 in NLSY79
. kdensity afqt80 if sample==0&age==16, addplot(kdensity afqt80 if sample==1&age==16, lpattern(_)) title(Figure 1: AFQT Scores at
>  Age 16) ///
> note("The NLSY79-scores are the P&P scores reported by the NLSY79." "The NLSY-97 scores are based on the CAT scores from NLSY97
>  and the equation by Segal (1997)." "Both populations are weighted to be population representative.") ///
> legend(label(1 "NLSY 1979") label(2 "NLSY 1997") cols(2)) saving(HistScores, replace)
(file HistScores.gph saved)

. 
. 
. // Combine sparsely populated age-groups in both samples with adjacent age-groups //
. replace age=22 if age==23 & sample==0
(357 real changes made)

. replace age=17 if age==18 & sample==1
(3 real changes made)

. 
. *******************************************************************
. *EXPANDING THE DATA AND CREATING PERCENTILES BY HAND
. // The following procedure improves the quality of the percentile mapping. 
. // The problem is that some observations 'belong' in several percentiles because they 
. // have large weights. Stata commands such as xtile will simply assign a unique percentile to these
. // observations. Instead, we need to account for the fact that these observations belong to several pctiles. 
. // This is achieved by expanding the data-set proportionally to the weights and then generating percentiles. 
. 
. *expanding each observation by its weight - an observation with a weight of 1100 is expanded into 11 observations. 
. gen percentile_rank=.
(18880 missing values generated)

. replace weight=round(weight/100)
(18880 real changes made)

. foreach num of numlist 0/160 {
  2.         qui expand `num' if weight==`num'
  3. }

. 
. *generating a unique rank within each sample and age
. bysort sample age: egen r=rank(afqt80), u

. gen pafqt=.
(468653 missing values generated)

. 
. * Divide the rank by one hundredth of the number of (expanded) observations in a given age and sample
. * to get the percentile of an individual.
. 
. gen holdey = 1/100

. bysort sample age: egen sumWgts=total(holdey)

. 
. *SAMPLE==0
. qui replace pafqt=round(r/sumWgts) if age==15 & sample==0

. qui replace pafqt=round(r/sumWgts) if age==16 & sample==0

. qui replace pafqt=round(r/sumWgts) if age==17 & sample==0

. qui replace pafqt=round(r/sumWgts) if age==18 & sample==0

. qui replace pafqt=round(r/sumWgts) if age==19 & sample==0

. qui replace pafqt=round(r/sumWgts) if age==20 & sample==0

. qui replace pafqt=round(r/sumWgts) if age==21 & sample==0

. qui replace pafqt=round(r/sumWgts) if age==22 & sample==0

. 
. *SAMPLE 1
. qui replace pafqt=round(r/sumWgts) if age==12 & sample==1

. qui replace pafqt=round(r/sumWgts) if age==13 & sample==1

. qui replace pafqt=round(r/sumWgts) if age==14 & sample==1

. qui replace pafqt=round(r/sumWgts) if age==15 & sample==1

. qui replace pafqt=round(r/sumWgts) if age==16 & sample==1

. qui replace pafqt=round(r/sumWgts) if age==17 & sample==1

. 
. *dropping the duplicates now
. egen tag=tag(sample pid)

. keep if tag==1
(449773 observations deleted)

. drop tag

. 
. sort sample pafqt

. save `nlsy_agestd', replace
(note: file /tmp/St15170.00000n not found)
file /tmp/St15170.00000n saved

. 
. *****************************************************************************
. 
. // Within sample, we map AFQT scores at each age into the age=16
. // distribution. We therefore require mean AFQT-scores of age=16 by percentile in each sample. 
. // We need to generate these averages using the weights.
. bys sample pafqt: egen pop=sum(weight) if age==16
(16066 missing values generated)

. bys sample pafqt: egen tot_score=sum(afqt80*weight) if age==16  // Mean age=16 raw score for each percentile //
(16066 missing values generated)

. bys sample pafqt: gen mean=tot_score/pop if age==16
(16066 missing values generated)

. drop tot_score pop

. egen tag=tag(sample pafqt mean) if age==16                      // We need only 1 obs per sample and percentile //

. keep if tag==1
(18678 observations deleted)

. ren mean afqt_std

. keep sample afqt_std pafqt afqt80

. sort sample pafqt

. save `agestd_afqt', replace
(note: file /tmp/St15170.00000o not found)
file /tmp/St15170.00000o saved

. 
. use `nlsy_agestd', clear

. merge sample pafqt using `agestd_afqt' // Merge into each percentile the age=16 corresponding score //
(note: you are using old merge syntax; see [D] merge for new syntax)
variables sample pafqt do not uniquely identify observations in the master data

. drop _merge

. keep sample pid male pafqt afqt_std afqt80 weight age

. sort sample pid

. // The final data contains "afqt_std": which is the comparable score across the 2 NLSY samples //
. *keep if sample==1 // this drops the NLSY79 observations //
. ren pid ID

. ren weight weight_altonji

. gen year = 1979 if sample==0
(7002 missing values generated)

. replace year = 1997 if sample==1 
(7002 real changes made)

. *drop sample age afqt80 pafqt // drop the other variables since I don't use them in my research //
. preserve

.         use `afqt97', clear

.         keep pid male

.         ren pid ID

.         save `missings', replace
(note: file /tmp/St15170.00000p not found)
file /tmp/St15170.00000p saved

. restore

. 
. preserve

.         keep if sample==1
(11878 observations deleted)

.         merge 1:1 ID using `missings'

    Result                           # of obs.
    -----------------------------------------
    not matched                         1,982
        from master                         0  (_merge==1)
        from using                      1,982  (_merge==2)

    matched                             7,002  (_merge==3)
    -----------------------------------------

.         tab _merge

                 _merge |      Freq.     Percent        Cum.
------------------------+-----------------------------------
         using only (2) |      1,982       22.06       22.06
            matched (3) |      7,002       77.94      100.00
------------------------+-----------------------------------
                  Total |      8,984      100.00

.         zscore afqt_std
z_afqt_std created with 1982 missing values

.         drop afqt_std

.         ren z_afqt_std afqt_std

.         keep if male==1
(4385 observations deleted)

.         sort ID

.         outsheet ID afqt_std using altonjiAFQT.csv, comma nol replace

. restore

. save afqt_adjusted_final, replace 
file afqt_adjusted_final.dta saved

. save `finished_product', replace
(note: file /tmp/St15170.00000q not found)
file /tmp/St15170.00000q saved

. 
. tabstat afqt_std, by(sample) stats(n min max mean median)

Summary for variables: afqt_std
     by categories of: sample 

  sample |         N       min       max      mean       p50
---------+--------------------------------------------------
       0 |     11878    86.475  218.2206  157.1244  158.0183
       1 |      7002  80.07746  219.2908  161.7048  167.4191
---------+--------------------------------------------------
   Total |     18880  80.07746  219.2908  158.8231  160.9533
------------------------------------------------------------

. 
. isid ID year

. 
. * Test: the afqt distributions across age within sample should now be identical. There will still be very small deviations
. * because of the coarseness of the above expansion, but we believe these to be third order. To allow this code to run 
. * rapidly, we tolerate these deviations. 
. 
. **bys sample age: sum afqt_std [fw=weight], d
. 
. use `finished_product', clear

. ** Generate an NLSY79-only supplement:
. drop if sample==1
(7002 observations deleted)

. save afqt_adjusted_final79, replace
file afqt_adjusted_final79.dta saved

. 
. use `finished_product', clear

. ** Generate an NLSY97-only supplement:
. drop if sample==0
(11878 observations deleted)

. save afqt_adjusted_final97, replace
file afqt_adjusted_final97.dta saved

. 
. log close
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_TEST/AFQT_MATCHING_TEST_with_weights.log
  log type:  text
 closed on:  25 Nov 2013, 15:57:11
---------------------------------------------------------------------------------------------------------------------------------
